import pandas as pd
import numpy as np
import datetime as dt
import sys
import warnings
import IPython as ip
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import missingno as msno
# Configuration pour travail avec fichier python "tools" de fonctions
%load_ext autoreload
%aimport tools
#
# Set option
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Recharger les modules pour la conception des fichiers tools
%autoreload 1
warnings.filterwarnings("ignore")
# Import données
data = pd.read_csv('assets/datas/openfoodfacts.csv', sep='\t', low_memory=False)
# Visualisation d'un échantillon de la population
data.sample(5)
| code | url | creator | created_t | created_datetime | last_modified_t | last_modified_datetime | product_name | generic_name | quantity | packaging | packaging_tags | brands | brands_tags | categories | categories_tags | categories_fr | origins | origins_tags | manufacturing_places | manufacturing_places_tags | labels | labels_tags | labels_fr | emb_codes | emb_codes_tags | first_packaging_code_geo | cities | cities_tags | purchase_places | stores | countries | countries_tags | countries_fr | ingredients_text | allergens | allergens_fr | traces | traces_tags | traces_fr | serving_size | no_nutriments | additives_n | additives | additives_tags | additives_fr | ingredients_from_palm_oil_n | ingredients_from_palm_oil | ingredients_from_palm_oil_tags | ingredients_that_may_be_from_palm_oil_n | ingredients_that_may_be_from_palm_oil | ingredients_that_may_be_from_palm_oil_tags | nutrition_grade_uk | nutrition_grade_fr | pnns_groups_1 | pnns_groups_2 | states | states_tags | states_fr | main_category | main_category_fr | image_url | image_small_url | energy_100g | energy-from-fat_100g | fat_100g | saturated-fat_100g | butyric-acid_100g | caproic-acid_100g | caprylic-acid_100g | capric-acid_100g | lauric-acid_100g | myristic-acid_100g | palmitic-acid_100g | stearic-acid_100g | arachidic-acid_100g | behenic-acid_100g | lignoceric-acid_100g | cerotic-acid_100g | montanic-acid_100g | melissic-acid_100g | monounsaturated-fat_100g | polyunsaturated-fat_100g | omega-3-fat_100g | alpha-linolenic-acid_100g | eicosapentaenoic-acid_100g | docosahexaenoic-acid_100g | omega-6-fat_100g | linoleic-acid_100g | arachidonic-acid_100g | gamma-linolenic-acid_100g | dihomo-gamma-linolenic-acid_100g | omega-9-fat_100g | oleic-acid_100g | elaidic-acid_100g | gondoic-acid_100g | mead-acid_100g | erucic-acid_100g | nervonic-acid_100g | trans-fat_100g | cholesterol_100g | carbohydrates_100g | sugars_100g | sucrose_100g | glucose_100g | fructose_100g | lactose_100g | maltose_100g | maltodextrins_100g | starch_100g | polyols_100g | fiber_100g | proteins_100g | casein_100g | serum-proteins_100g | nucleotides_100g | salt_100g | sodium_100g | alcohol_100g | vitamin-a_100g | beta-carotene_100g | vitamin-d_100g | vitamin-e_100g | vitamin-k_100g | vitamin-c_100g | vitamin-b1_100g | vitamin-b2_100g | vitamin-pp_100g | vitamin-b6_100g | vitamin-b9_100g | folates_100g | vitamin-b12_100g | biotin_100g | pantothenic-acid_100g | silica_100g | bicarbonate_100g | potassium_100g | chloride_100g | calcium_100g | phosphorus_100g | iron_100g | magnesium_100g | zinc_100g | copper_100g | manganese_100g | fluoride_100g | selenium_100g | chromium_100g | molybdenum_100g | iodine_100g | caffeine_100g | taurine_100g | ph_100g | fruits-vegetables-nuts_100g | collagen-meat-protein-ratio_100g | cocoa_100g | chlorophyl_100g | carbon-footprint_100g | nutrition-score-fr_100g | nutrition-score-uk_100g | glycemic-index_100g | water-hardness_100g | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 29015 | 0033900100204 | http://world-fr.openfoodfacts.org/produit/0033... | usda-ndb-import | 1489056943 | 2017-03-09T10:55:43Z | 1489056943 | 2017-03-09T10:55:43Z | Hickory Smoked Sliced Bacon | NaN | NaN | NaN | NaN | Jones Dairy Farm | jones-dairy-farm | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | US | en:united-states | États-Unis | Cured with water, salt, sugar, sodium phosphat... | NaN | NaN | NaN | NaN | NaN | 9 g (9 g) | NaN | 3.0 | [ cured-with-water -> en:cured-with-water ] ... | en:e339,en:e301,en:e250 | E339 - Orthophosphates de sodium,E301 - Ascorb... | 0.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | en:to-be-completed, en:nutrition-facts-complet... | en:to-be-completed,en:nutrition-facts-complete... | A compléter,Informations nutritionnelles compl... | NaN | NaN | NaN | NaN | 2326.0 | NaN | 44.44 | 16.67 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.111 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 33.33 | NaN | NaN | NaN | 3.66776 | 1.444 | NaN | NaN | NaN | NaN | NaN | NaN | 0.0133 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 76736 | 0071627075801 | http://world-fr.openfoodfacts.org/produit/0071... | usda-ndb-import | 1489090861 | 2017-03-09T20:21:01Z | 1489090861 | 2017-03-09T20:21:01Z | Kosher Ground White Turkey | NaN | NaN | NaN | NaN | Empire Kosher Poultry Inc | empire-kosher-poultry-inc | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | US | en:united-states | États-Unis | White turkey, natural flavorings. | NaN | NaN | NaN | NaN | NaN | 112 g (4 ONZ) | NaN | 0.0 | [ white-turkey -> en:white-turkey ] [ turke... | NaN | NaN | 0.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | a | NaN | NaN | en:to-be-completed, en:nutrition-facts-complet... | en:to-be-completed,en:nutrition-facts-complete... | A compléter,Informations nutritionnelles compl... | NaN | NaN | NaN | NaN | 598.0 | NaN | 7.14 | 2.23 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.058 | 0.0 | 0.00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 19.64 | NaN | NaN | NaN | 0.28448 | 0.112 | NaN | 0.000027 | NaN | NaN | NaN | NaN | 0.0000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.036 | NaN | 0.00161 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -1.0 | -1.0 | NaN | NaN |
| 164226 | 0856500004822 | http://world-fr.openfoodfacts.org/produit/0856... | usda-ndb-import | 1489133160 | 2017-03-10T08:06:00Z | 1489133161 | 2017-03-10T08:06:01Z | Hope, Organic Guacamole, Spicy Green Chile | NaN | NaN | NaN | NaN | Hope Foods Llc | hope-foods-llc | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | US | en:united-states | États-Unis | Organic avocado, organic green chile, sea salt... | NaN | NaN | NaN | NaN | NaN | 30 g (2 Tbsp) | NaN | 0.0 | [ organic-avocado -> en:organic-avocado ] [... | NaN | NaN | 0.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | a | NaN | NaN | en:to-be-completed, en:nutrition-facts-complet... | en:to-be-completed,en:nutrition-facts-complete... | A compléter,Informations nutritionnelles compl... | NaN | NaN | NaN | NaN | 556.0 | NaN | 13.33 | 1.67 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.000 | 10.0 | 0.00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6.7 | 3.33 | NaN | NaN | NaN | 0.88900 | 0.350 | NaN | 0.000000 | NaN | NaN | NaN | NaN | 0.0200 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.000 | NaN | 0.00000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -2.0 | -2.0 | NaN | NaN |
| 255589 | 3770006344060 | http://world-fr.openfoodfacts.org/produit/3770... | teolemon | 1489081881 | 2017-03-09T17:51:21Z | 1489081893 | 2017-03-09T17:51:33Z | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | en:FR | en:france | France | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | en:to-be-completed, en:nutrition-facts-to-be-c... | en:to-be-completed,en:nutrition-facts-to-be-co... | A compléter,Informations nutritionnelles à com... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 128654 | 07078442012 | http://world-fr.openfoodfacts.org/produit/0707... | usda-ndb-import | 1489140246 | 2017-03-10T10:04:06Z | 1489140246 | 2017-03-10T10:04:06Z | Stuffing Mix | NaN | NaN | NaN | NaN | Tops | tops | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | US | en:united-states | États-Unis | Enriched flour (wheat flour, reduced iron, nia... | NaN | NaN | NaN | NaN | NaN | 28 g (0.167 BOX, MAKES ABOUT 0.5 CUP PREPARED) | NaN | 2.0 | [ enriched-flour -> en:enriched-flour ] [ f... | en:e375,en:e101 | E375 - Acide nicotinique,E101 - Riboflavine | 0.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | c | NaN | NaN | en:to-be-completed, en:nutrition-facts-complet... | en:to-be-completed,en:nutrition-facts-complete... | A compléter,Informations nutritionnelles compl... | NaN | NaN | NaN | NaN | 1644.0 | NaN | 3.57 | 0.00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.000 | 75.0 | 7.14 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3.6 | 10.71 | NaN | NaN | NaN | 3.53822 | 1.393 | NaN | 0.000000 | NaN | NaN | NaN | NaN | 0.0000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.000 | NaN | 0.00386 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 10.0 | 10.0 | NaN | NaN |
# Création d'un df de travail
df = data.copy()
# On affiche nos types de variable en faisant appel à la fonction crée dans tools
tools.get_types_variables(df,True,False,False)
------------------------------------------------------------- Type de variable pour chacune des variables
code object url object creator object created_t object created_datetime object last_modified_t object last_modified_datetime object product_name object generic_name object quantity object packaging object packaging_tags object brands object brands_tags object categories object categories_tags object categories_fr object origins object origins_tags object manufacturing_places object manufacturing_places_tags object labels object labels_tags object labels_fr object emb_codes object emb_codes_tags object first_packaging_code_geo object cities object cities_tags object purchase_places object stores object countries object countries_tags object countries_fr object ingredients_text object allergens object allergens_fr object traces object traces_tags object traces_fr object serving_size object no_nutriments float64 additives_n float64 additives object additives_tags object additives_fr object ingredients_from_palm_oil_n float64 ingredients_from_palm_oil float64 ingredients_from_palm_oil_tags object ingredients_that_may_be_from_palm_oil_n float64 ingredients_that_may_be_from_palm_oil float64 ingredients_that_may_be_from_palm_oil_tags object nutrition_grade_uk float64 nutrition_grade_fr object pnns_groups_1 object pnns_groups_2 object states object states_tags object states_fr object main_category object main_category_fr object image_url object image_small_url object energy_100g float64 energy-from-fat_100g float64 fat_100g float64 saturated-fat_100g float64 butyric-acid_100g float64 caproic-acid_100g float64 caprylic-acid_100g float64 capric-acid_100g float64 lauric-acid_100g float64 myristic-acid_100g float64 palmitic-acid_100g float64 stearic-acid_100g float64 arachidic-acid_100g float64 behenic-acid_100g float64 lignoceric-acid_100g float64 cerotic-acid_100g float64 montanic-acid_100g float64 melissic-acid_100g float64 monounsaturated-fat_100g float64 polyunsaturated-fat_100g float64 omega-3-fat_100g float64 alpha-linolenic-acid_100g float64 eicosapentaenoic-acid_100g float64 docosahexaenoic-acid_100g float64 omega-6-fat_100g float64 linoleic-acid_100g float64 arachidonic-acid_100g float64 gamma-linolenic-acid_100g float64 dihomo-gamma-linolenic-acid_100g float64 omega-9-fat_100g float64 oleic-acid_100g float64 elaidic-acid_100g float64 gondoic-acid_100g float64 mead-acid_100g float64 erucic-acid_100g float64 nervonic-acid_100g float64 trans-fat_100g float64 cholesterol_100g float64 carbohydrates_100g float64 sugars_100g float64 sucrose_100g float64 glucose_100g float64 fructose_100g float64 lactose_100g float64 maltose_100g float64 maltodextrins_100g float64 starch_100g float64 polyols_100g float64 fiber_100g float64 proteins_100g float64 casein_100g float64 serum-proteins_100g float64 nucleotides_100g float64 salt_100g float64 sodium_100g float64 alcohol_100g float64 vitamin-a_100g float64 beta-carotene_100g float64 vitamin-d_100g float64 vitamin-e_100g float64 vitamin-k_100g float64 vitamin-c_100g float64 vitamin-b1_100g float64 vitamin-b2_100g float64 vitamin-pp_100g float64 vitamin-b6_100g float64 vitamin-b9_100g float64 folates_100g float64 vitamin-b12_100g float64 biotin_100g float64 pantothenic-acid_100g float64 silica_100g float64 bicarbonate_100g float64 potassium_100g float64 chloride_100g float64 calcium_100g float64 phosphorus_100g float64 iron_100g float64 magnesium_100g float64 zinc_100g float64 copper_100g float64 manganese_100g float64 fluoride_100g float64 selenium_100g float64 chromium_100g float64 molybdenum_100g float64 iodine_100g float64 caffeine_100g float64 taurine_100g float64 ph_100g float64 fruits-vegetables-nuts_100g float64 collagen-meat-protein-ratio_100g float64 cocoa_100g float64 chlorophyl_100g float64 carbon-footprint_100g float64 nutrition-score-fr_100g float64 nutrition-score-uk_100g float64 glycemic-index_100g float64 water-hardness_100g float64 dtype: object
# # On initialise les variables qui suivrony les types
float_columns = df.select_dtypes(include=['float64']).columns
object_columns = df.select_dtypes(include=['object']).columns
datetime_columns = df.select_dtypes(include=['datetime64']).columns
# On crée une fonction pour le suivi
def get_infos_df(df,with_types) :
float_columns = df.select_dtypes(include=['float64']).columns
object_columns = df.select_dtypes(include=['object']).columns
datetime_columns = df.select_dtypes(include=['datetime64[ns]']).columns
print("--------------GENERAL----------------------------------")
print(f"taille df : {df.shape}")
print("")
if with_types:
print("--------------OBJECT----------------------------------")
print(f"Nombre de colonne du type Object : {df[object_columns].shape[1]}")
print("---------->")
print(f"{object_columns.to_list()}")
print("")
print("--------------FLOAT-----------------------------------")
print(f"Nombre de colonne du type Float : {df[float_columns].shape[1]}")
print("---------->")
print(f"{float_columns.to_list()}")
print("")
print("--------------datetime64[ns]-----------------------------------")
print(f"Nombre de colonne du type datetime : {df[datetime_columns].shape[1]}")
print("---------->")
print(f"{datetime_columns.to_list()}")
# On supprime les erreurs
df.loc[df['created_datetime']=='Dia,Sogeres','created_datetime']='NaN'
df.loc[df['created_datetime']=='SuperU','created_datetime']='NaN'
df.loc[df['created_datetime']=='Carrefour','created_datetime']='NaN'
df.loc[df['created_datetime']=='Auchan','created_datetime']='NaN'
df.loc[df['created_datetime']=='Carrefour market','created_datetime']='NaN'
df.loc[df['created_datetime']=='SuperU,Leclerc','created_datetime']='NaN'
df.loc[df['created_datetime']=='Spar','created_datetime']='NaN'
df.loc[df['created_datetime']=='Banque alimentaire','created_datetime']='NaN'
df.loc[df['created_datetime']=='Leclerc','created_datetime']='NaN'
df.loc[df['created_datetime']=='Auchan,Super U','created_datetime']='NaN'
df.loc[df['created_datetime']=='Super U,Leclerc,Carrefour','created_datetime']='NaN'
df.loc[df['created_datetime']=='Cora','created_datetime']='NaN'
df.loc[df['created_datetime']=='Super U','created_datetime']='NaN'
df.loc[df['created_datetime']=='Super U,Leclerc,Carrefour','created_datetime']='NaN'
# On supprime les erreurs
df.loc[df['created_t']=='France','created_t']='NaN'
df.loc[df['created_t']=='Villecresnes,Villeurbanne,France','created_t']='NaN'
df.loc[df['created_t']=='Veynes,France','created_t']='NaN'
df.loc[df['created_t']=='Saint-Priest,France','created_t']='NaN'
df.loc[df['created_t']=='Torcy 77,France','created_t']='NaN'
df.loc[df['created_t']=='Villers Bocage 80260,France','created_t']='NaN'
df.loc[df['created_t']=='Montgermont','created_t']='NaN'
df.loc[df['created_t']=='Marseille 5°,France','created_t']='NaN'
df.loc[df['created_t']=='France,Nantes,Carquefou','created_t']='NaN'
df.loc[df['created_t']=='Courrières,France','created_t']='NaN'
df.loc[df['created_t']=='Brétigny-sur-Orge,Marseille 5°','created_t']='NaN'
df.loc[df['created_t']=='Brétigny-sur-Orge,Marseille 5°,France','created_t']='NaN'
df.loc[df['created_t']=='France','created_t']='NaN'
# On supprime les erreurs
df.loc[df['last_modified_t']=='Belgique,France','last_modified_t']='NaN'
df.loc[df['last_modified_t']=='France','last_modified_t']='NaN'
df.loc[df['last_modified_t']=='Belgique,France, en:switzerland','last_modified_t']='NaN'
df.loc[df['last_modified_t']=='Suisse,France','last_modified_t']='NaN'
# On supprime les erreurs de saisie
df.loc[df['last_modified_datetime']=='en:belgium,en:france','last_modified_datetime']='NaN'
df.loc[df['last_modified_datetime']=='en:france','last_modified_datetime']='NaN'
df.loc[df['last_modified_datetime']=='en:belgium,en:france,en:switzerland','last_modified_datetime']='NaN'
df.loc[df['last_modified_datetime']=='en:france,en:switzerland','last_modified_datetime']='NaN'
# On convertit la colonne last_modified_dattetime au format date
df["last_modified_datetime"] = pd.to_datetime(df["last_modified_datetime"], format="%Y-%m-%dT%H:%M:%SZ")
# On convertit la colonne last_modified_t au format date
df['last_modified_t'] = pd.to_datetime(df['last_modified_t'],unit='s')
# On convertit la colonne created_t au format date
df['created_t'] = pd.to_datetime(df['created_t'],unit='s')
# On convertit la colonne created_datetime au format date
df["created_datetime"] = pd.to_datetime(df["created_datetime"], format="%Y-%m-%dT%H:%M:%SZ")
# On met à jour les variables
get_infos_df(df,with_types=True)
--------------GENERAL---------------------------------- taille df : (320772, 162) --------------OBJECT---------------------------------- Nombre de colonne du type Object : 52 ----------> ['code', 'url', 'creator', 'product_name', 'generic_name', 'quantity', 'packaging', 'packaging_tags', 'brands', 'brands_tags', 'categories', 'categories_tags', 'categories_fr', 'origins', 'origins_tags', 'manufacturing_places', 'manufacturing_places_tags', 'labels', 'labels_tags', 'labels_fr', 'emb_codes', 'emb_codes_tags', 'first_packaging_code_geo', 'cities', 'cities_tags', 'purchase_places', 'stores', 'countries', 'countries_tags', 'countries_fr', 'ingredients_text', 'allergens', 'allergens_fr', 'traces', 'traces_tags', 'traces_fr', 'serving_size', 'additives', 'additives_tags', 'additives_fr', 'ingredients_from_palm_oil_tags', 'ingredients_that_may_be_from_palm_oil_tags', 'nutrition_grade_fr', 'pnns_groups_1', 'pnns_groups_2', 'states', 'states_tags', 'states_fr', 'main_category', 'main_category_fr', 'image_url', 'image_small_url'] --------------FLOAT----------------------------------- Nombre de colonne du type Float : 106 ----------> ['no_nutriments', 'additives_n', 'ingredients_from_palm_oil_n', 'ingredients_from_palm_oil', 'ingredients_that_may_be_from_palm_oil_n', 'ingredients_that_may_be_from_palm_oil', 'nutrition_grade_uk', 'energy_100g', 'energy-from-fat_100g', 'fat_100g', 'saturated-fat_100g', 'butyric-acid_100g', 'caproic-acid_100g', 'caprylic-acid_100g', 'capric-acid_100g', 'lauric-acid_100g', 'myristic-acid_100g', 'palmitic-acid_100g', 'stearic-acid_100g', 'arachidic-acid_100g', 'behenic-acid_100g', 'lignoceric-acid_100g', 'cerotic-acid_100g', 'montanic-acid_100g', 'melissic-acid_100g', 'monounsaturated-fat_100g', 'polyunsaturated-fat_100g', 'omega-3-fat_100g', 'alpha-linolenic-acid_100g', 'eicosapentaenoic-acid_100g', 'docosahexaenoic-acid_100g', 'omega-6-fat_100g', 'linoleic-acid_100g', 'arachidonic-acid_100g', 'gamma-linolenic-acid_100g', 'dihomo-gamma-linolenic-acid_100g', 'omega-9-fat_100g', 'oleic-acid_100g', 'elaidic-acid_100g', 'gondoic-acid_100g', 'mead-acid_100g', 'erucic-acid_100g', 'nervonic-acid_100g', 'trans-fat_100g', 'cholesterol_100g', 'carbohydrates_100g', 'sugars_100g', 'sucrose_100g', 'glucose_100g', 'fructose_100g', 'lactose_100g', 'maltose_100g', 'maltodextrins_100g', 'starch_100g', 'polyols_100g', 'fiber_100g', 'proteins_100g', 'casein_100g', 'serum-proteins_100g', 'nucleotides_100g', 'salt_100g', 'sodium_100g', 'alcohol_100g', 'vitamin-a_100g', 'beta-carotene_100g', 'vitamin-d_100g', 'vitamin-e_100g', 'vitamin-k_100g', 'vitamin-c_100g', 'vitamin-b1_100g', 'vitamin-b2_100g', 'vitamin-pp_100g', 'vitamin-b6_100g', 'vitamin-b9_100g', 'folates_100g', 'vitamin-b12_100g', 'biotin_100g', 'pantothenic-acid_100g', 'silica_100g', 'bicarbonate_100g', 'potassium_100g', 'chloride_100g', 'calcium_100g', 'phosphorus_100g', 'iron_100g', 'magnesium_100g', 'zinc_100g', 'copper_100g', 'manganese_100g', 'fluoride_100g', 'selenium_100g', 'chromium_100g', 'molybdenum_100g', 'iodine_100g', 'caffeine_100g', 'taurine_100g', 'ph_100g', 'fruits-vegetables-nuts_100g', 'collagen-meat-protein-ratio_100g', 'cocoa_100g', 'chlorophyl_100g', 'carbon-footprint_100g', 'nutrition-score-fr_100g', 'nutrition-score-uk_100g', 'glycemic-index_100g', 'water-hardness_100g'] --------------datetime64[ns]----------------------------------- Nombre de colonne du type datetime : 4 ----------> ['created_t', 'created_datetime', 'last_modified_t', 'last_modified_datetime']
# On visualise les variables date
datetime_columns = df.select_dtypes(include=['datetime64[ns]']).columns
df[datetime_columns].head()
| created_t | created_datetime | last_modified_t | last_modified_datetime | |
|---|---|---|---|---|
| 0 | 2016-09-17 09:17:46 | 2016-09-17 09:17:46 | 2016-09-17 09:18:13 | 2016-09-17 09:18:13 |
| 1 | 2017-03-09 14:32:37 | 2017-03-09 14:32:37 | 2017-03-09 14:32:37 | 2017-03-09 14:32:37 |
| 2 | 2017-03-09 14:32:37 | 2017-03-09 14:32:37 | 2017-03-09 14:32:37 | 2017-03-09 14:32:37 |
| 3 | 2017-03-09 10:35:31 | 2017-03-09 10:35:31 | 2017-03-09 10:35:31 | 2017-03-09 10:35:31 |
| 4 | 2017-03-09 10:34:13 | 2017-03-09 10:34:13 | 2017-03-09 10:34:13 | 2017-03-09 10:34:13 |
# On visialise la répartition de la création des dates
plt.figure(figsize=(15, 15), dpi=100) # taille
plt.subplot(4, 2, 1)
add_per_year = df[['created_t', 'code']].groupby(by=df['created_t'].dt.year).nunique()
sns.set_style("whitegrid")
sns.barplot(data=add_per_year, x=add_per_year.index, y='code', color='#00afe6')
plt.title("created_t : Evolution des créations de produits dans la base par année",fontweight='bold')
plt.xlabel("Année de création")
plt.ylabel("Nombre de créations")
plt.subplot(4, 2, 2)
add_per_year = df[['created_datetime', 'code']].groupby(by=df['created_datetime'].dt.year).nunique()
sns.set_style("whitegrid")
sns.barplot(data=add_per_year, x=add_per_year.index, y='code', color='#00afe6')
plt.title("created_datetime : Evolution des créations de produits dans la base par année",fontweight='bold')
plt.xlabel("Année de création")
plt.ylabel("Nombre de créations")
plt.subplot(4, 2, 3)
add_per_year = df[['last_modified_t', 'code']].groupby(by=df['last_modified_t'].dt.year).nunique()
sns.set_style("whitegrid")
sns.barplot(data=add_per_year, x=add_per_year.index, y='code', color='#00afe6')
plt.title("last_modified_t : Evolution des créations de produits dans la base par année",fontweight='bold')
plt.xlabel("Année de création")
plt.ylabel("Nombre de créations")
plt.subplot(4, 2, 4)
add_per_year = df[['last_modified_datetime', 'code']].groupby(by=df['last_modified_datetime'].dt.year).nunique()
sns.set_style("whitegrid")
sns.barplot(data=add_per_year, x=add_per_year.index, y='code', color='#00afe6')
plt.title("last_modified_datetime : Evolution des créations de produits dans la base par année",fontweight='bold')
plt.xlabel("Année de création")
plt.ylabel("Nombre de créations")
plt.subplots_adjust(left=0.125, bottom=0.1, right=0.9, top=0.9, wspace=1, hspace=0.35)
plt.show()
# On visualise la répartition des types
tools.get_types_variables(df,True,True,True)
------------------------------------------------------------- Type de variable pour chacune des variables
code object url object creator object created_t datetime64[ns] created_datetime datetime64[ns] last_modified_t datetime64[ns] last_modified_datetime datetime64[ns] product_name object generic_name object quantity object packaging object packaging_tags object brands object brands_tags object categories object categories_tags object categories_fr object origins object origins_tags object manufacturing_places object manufacturing_places_tags object labels object labels_tags object labels_fr object emb_codes object emb_codes_tags object first_packaging_code_geo object cities object cities_tags object purchase_places object stores object countries object countries_tags object countries_fr object ingredients_text object allergens object allergens_fr object traces object traces_tags object traces_fr object serving_size object no_nutriments float64 additives_n float64 additives object additives_tags object additives_fr object ingredients_from_palm_oil_n float64 ingredients_from_palm_oil float64 ingredients_from_palm_oil_tags object ingredients_that_may_be_from_palm_oil_n float64 ingredients_that_may_be_from_palm_oil float64 ingredients_that_may_be_from_palm_oil_tags object nutrition_grade_uk float64 nutrition_grade_fr object pnns_groups_1 object pnns_groups_2 object states object states_tags object states_fr object main_category object main_category_fr object image_url object image_small_url object energy_100g float64 energy-from-fat_100g float64 fat_100g float64 saturated-fat_100g float64 butyric-acid_100g float64 caproic-acid_100g float64 caprylic-acid_100g float64 capric-acid_100g float64 lauric-acid_100g float64 myristic-acid_100g float64 palmitic-acid_100g float64 stearic-acid_100g float64 arachidic-acid_100g float64 behenic-acid_100g float64 lignoceric-acid_100g float64 cerotic-acid_100g float64 montanic-acid_100g float64 melissic-acid_100g float64 monounsaturated-fat_100g float64 polyunsaturated-fat_100g float64 omega-3-fat_100g float64 alpha-linolenic-acid_100g float64 eicosapentaenoic-acid_100g float64 docosahexaenoic-acid_100g float64 omega-6-fat_100g float64 linoleic-acid_100g float64 arachidonic-acid_100g float64 gamma-linolenic-acid_100g float64 dihomo-gamma-linolenic-acid_100g float64 omega-9-fat_100g float64 oleic-acid_100g float64 elaidic-acid_100g float64 gondoic-acid_100g float64 mead-acid_100g float64 erucic-acid_100g float64 nervonic-acid_100g float64 trans-fat_100g float64 cholesterol_100g float64 carbohydrates_100g float64 sugars_100g float64 sucrose_100g float64 glucose_100g float64 fructose_100g float64 lactose_100g float64 maltose_100g float64 maltodextrins_100g float64 starch_100g float64 polyols_100g float64 fiber_100g float64 proteins_100g float64 casein_100g float64 serum-proteins_100g float64 nucleotides_100g float64 salt_100g float64 sodium_100g float64 alcohol_100g float64 vitamin-a_100g float64 beta-carotene_100g float64 vitamin-d_100g float64 vitamin-e_100g float64 vitamin-k_100g float64 vitamin-c_100g float64 vitamin-b1_100g float64 vitamin-b2_100g float64 vitamin-pp_100g float64 vitamin-b6_100g float64 vitamin-b9_100g float64 folates_100g float64 vitamin-b12_100g float64 biotin_100g float64 pantothenic-acid_100g float64 silica_100g float64 bicarbonate_100g float64 potassium_100g float64 chloride_100g float64 calcium_100g float64 phosphorus_100g float64 iron_100g float64 magnesium_100g float64 zinc_100g float64 copper_100g float64 manganese_100g float64 fluoride_100g float64 selenium_100g float64 chromium_100g float64 molybdenum_100g float64 iodine_100g float64 caffeine_100g float64 taurine_100g float64 ph_100g float64 fruits-vegetables-nuts_100g float64 collagen-meat-protein-ratio_100g float64 cocoa_100g float64 chlorophyl_100g float64 carbon-footprint_100g float64 nutrition-score-fr_100g float64 nutrition-score-uk_100g float64 glycemic-index_100g float64 water-hardness_100g float64 dtype: object
---------------------------------------------------------- Répartition des types de variable
| Nombre par type de variable | % des types de variable | |
|---|---|---|
| float64 | 106 | 65.430000 |
| object | 52 | 32.100000 |
| datetime64[ns] | 4 | 2.470000 |
# Probable erreur de saisie
df.loc[df['created_t']=='01-01-1970']
| code | url | creator | created_t | created_datetime | last_modified_t | last_modified_datetime | product_name | generic_name | quantity | packaging | packaging_tags | brands | brands_tags | categories | categories_tags | categories_fr | origins | origins_tags | manufacturing_places | manufacturing_places_tags | labels | labels_tags | labels_fr | emb_codes | emb_codes_tags | first_packaging_code_geo | cities | cities_tags | purchase_places | stores | countries | countries_tags | countries_fr | ingredients_text | allergens | allergens_fr | traces | traces_tags | traces_fr | serving_size | no_nutriments | additives_n | additives | additives_tags | additives_fr | ingredients_from_palm_oil_n | ingredients_from_palm_oil | ingredients_from_palm_oil_tags | ingredients_that_may_be_from_palm_oil_n | ingredients_that_may_be_from_palm_oil | ingredients_that_may_be_from_palm_oil_tags | nutrition_grade_uk | nutrition_grade_fr | pnns_groups_1 | pnns_groups_2 | states | states_tags | states_fr | main_category | main_category_fr | image_url | image_small_url | energy_100g | energy-from-fat_100g | fat_100g | saturated-fat_100g | butyric-acid_100g | caproic-acid_100g | caprylic-acid_100g | capric-acid_100g | lauric-acid_100g | myristic-acid_100g | palmitic-acid_100g | stearic-acid_100g | arachidic-acid_100g | behenic-acid_100g | lignoceric-acid_100g | cerotic-acid_100g | montanic-acid_100g | melissic-acid_100g | monounsaturated-fat_100g | polyunsaturated-fat_100g | omega-3-fat_100g | alpha-linolenic-acid_100g | eicosapentaenoic-acid_100g | docosahexaenoic-acid_100g | omega-6-fat_100g | linoleic-acid_100g | arachidonic-acid_100g | gamma-linolenic-acid_100g | dihomo-gamma-linolenic-acid_100g | omega-9-fat_100g | oleic-acid_100g | elaidic-acid_100g | gondoic-acid_100g | mead-acid_100g | erucic-acid_100g | nervonic-acid_100g | trans-fat_100g | cholesterol_100g | carbohydrates_100g | sugars_100g | sucrose_100g | glucose_100g | fructose_100g | lactose_100g | maltose_100g | maltodextrins_100g | starch_100g | polyols_100g | fiber_100g | proteins_100g | casein_100g | serum-proteins_100g | nucleotides_100g | salt_100g | sodium_100g | alcohol_100g | vitamin-a_100g | beta-carotene_100g | vitamin-d_100g | vitamin-e_100g | vitamin-k_100g | vitamin-c_100g | vitamin-b1_100g | vitamin-b2_100g | vitamin-pp_100g | vitamin-b6_100g | vitamin-b9_100g | folates_100g | vitamin-b12_100g | biotin_100g | pantothenic-acid_100g | silica_100g | bicarbonate_100g | potassium_100g | chloride_100g | calcium_100g | phosphorus_100g | iron_100g | magnesium_100g | zinc_100g | copper_100g | manganese_100g | fluoride_100g | selenium_100g | chromium_100g | molybdenum_100g | iodine_100g | caffeine_100g | taurine_100g | ph_100g | fruits-vegetables-nuts_100g | collagen-meat-protein-ratio_100g | cocoa_100g | chlorophyl_100g | carbon-footprint_100g | nutrition-score-fr_100g | nutrition-score-uk_100g | glycemic-index_100g | water-hardness_100g | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 187478 | 3017760038409 | http://world-fr.openfoodfacts.org/produit/3017... | marianne | 1970-01-01 | NaT | 2017-03-08 16:54:15 | 2017-03-08 16:54:15 | Lulu la barquette (Fraise) | Génoise garnie à la purée de fraise | 120 g | Paquet,Carton | paquet,carton | LU | lu | Snacks sucrés,Biscuits et gâteaux,Biscuits,Bis... | en:sugary-snacks,en:biscuits-and-cakes,en:bisc... | Snacks sucrés,Biscuits et gâteaux,Biscuits,Bar... | NaN | NaN | France,Ardennes | france,ardennes | Sans colorants,Sans conservateurs | en:no-colorings,en:no-preservatives | Sans colorants,Sans conservateurs | NaN | NaN | NaN | NaN | NaN | Courrières,France | Cora | France,Suisse | en:france,en:switzerland | France,Suisse | Sirop de glucose-fructose, purée de fraises 27... | blé, œufs | NaN | Lait,Fruits à coque | en:milk,en:nuts | Lait,Fruits à coque | Un biscuit (6,66g) | NaN | 2.0 | [ sirop-de-glucose-fructose -> fr:sirop-de-gl... | en:e440,en:e330 | E440 - Pectines,E330 - Acide citrique | 0.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | d | Sugary snacks | Biscuits and cakes | en:to-be-checked, en:complete, en:nutrition-fa... | en:to-be-checked,en:complete,en:nutrition-fact... | A vérifier,Complet,Informations nutritionnelle... | en:biscuits | Biscuits | http://fr.openfoodfacts.org/images/products/30... | http://fr.openfoodfacts.org/images/products/30... | 1480.0 | NaN | 2.1 | 0.5 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 78.0 | 60.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.2 | 4.2 | NaN | NaN | NaN | 0.08 | 0.031496 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 13.0 | 13.0 | NaN | NaN |
# On lui affecte la date de de last_modified_t
df.iloc[187478,3]= df.iloc[187478,6]
df.iloc[187478,4]= df.iloc[187478,6]
df.iloc[187478,:7]
code 3017760038409 url http://world-fr.openfoodfacts.org/produit/3017... creator marianne created_t 2017-03-08 16:54:15 created_datetime 2017-03-08 16:54:15 last_modified_t 2017-03-08 16:54:15 last_modified_datetime 2017-03-08 16:54:15 Name: 187478, dtype: object
<div class="alert alert-block alert-info",fontweigth="bold"> INFORMATIONS GENERALES SUR LE JEU DE DONNEES </div>
# On affiche les listes par type
get_infos_df(df,with_types=True)
--------------GENERAL---------------------------------- taille df : (320772, 162) --------------OBJECT---------------------------------- Nombre de colonne du type Object : 52 ----------> ['code', 'url', 'creator', 'product_name', 'generic_name', 'quantity', 'packaging', 'packaging_tags', 'brands', 'brands_tags', 'categories', 'categories_tags', 'categories_fr', 'origins', 'origins_tags', 'manufacturing_places', 'manufacturing_places_tags', 'labels', 'labels_tags', 'labels_fr', 'emb_codes', 'emb_codes_tags', 'first_packaging_code_geo', 'cities', 'cities_tags', 'purchase_places', 'stores', 'countries', 'countries_tags', 'countries_fr', 'ingredients_text', 'allergens', 'allergens_fr', 'traces', 'traces_tags', 'traces_fr', 'serving_size', 'additives', 'additives_tags', 'additives_fr', 'ingredients_from_palm_oil_tags', 'ingredients_that_may_be_from_palm_oil_tags', 'nutrition_grade_fr', 'pnns_groups_1', 'pnns_groups_2', 'states', 'states_tags', 'states_fr', 'main_category', 'main_category_fr', 'image_url', 'image_small_url'] --------------FLOAT----------------------------------- Nombre de colonne du type Float : 106 ----------> ['no_nutriments', 'additives_n', 'ingredients_from_palm_oil_n', 'ingredients_from_palm_oil', 'ingredients_that_may_be_from_palm_oil_n', 'ingredients_that_may_be_from_palm_oil', 'nutrition_grade_uk', 'energy_100g', 'energy-from-fat_100g', 'fat_100g', 'saturated-fat_100g', 'butyric-acid_100g', 'caproic-acid_100g', 'caprylic-acid_100g', 'capric-acid_100g', 'lauric-acid_100g', 'myristic-acid_100g', 'palmitic-acid_100g', 'stearic-acid_100g', 'arachidic-acid_100g', 'behenic-acid_100g', 'lignoceric-acid_100g', 'cerotic-acid_100g', 'montanic-acid_100g', 'melissic-acid_100g', 'monounsaturated-fat_100g', 'polyunsaturated-fat_100g', 'omega-3-fat_100g', 'alpha-linolenic-acid_100g', 'eicosapentaenoic-acid_100g', 'docosahexaenoic-acid_100g', 'omega-6-fat_100g', 'linoleic-acid_100g', 'arachidonic-acid_100g', 'gamma-linolenic-acid_100g', 'dihomo-gamma-linolenic-acid_100g', 'omega-9-fat_100g', 'oleic-acid_100g', 'elaidic-acid_100g', 'gondoic-acid_100g', 'mead-acid_100g', 'erucic-acid_100g', 'nervonic-acid_100g', 'trans-fat_100g', 'cholesterol_100g', 'carbohydrates_100g', 'sugars_100g', 'sucrose_100g', 'glucose_100g', 'fructose_100g', 'lactose_100g', 'maltose_100g', 'maltodextrins_100g', 'starch_100g', 'polyols_100g', 'fiber_100g', 'proteins_100g', 'casein_100g', 'serum-proteins_100g', 'nucleotides_100g', 'salt_100g', 'sodium_100g', 'alcohol_100g', 'vitamin-a_100g', 'beta-carotene_100g', 'vitamin-d_100g', 'vitamin-e_100g', 'vitamin-k_100g', 'vitamin-c_100g', 'vitamin-b1_100g', 'vitamin-b2_100g', 'vitamin-pp_100g', 'vitamin-b6_100g', 'vitamin-b9_100g', 'folates_100g', 'vitamin-b12_100g', 'biotin_100g', 'pantothenic-acid_100g', 'silica_100g', 'bicarbonate_100g', 'potassium_100g', 'chloride_100g', 'calcium_100g', 'phosphorus_100g', 'iron_100g', 'magnesium_100g', 'zinc_100g', 'copper_100g', 'manganese_100g', 'fluoride_100g', 'selenium_100g', 'chromium_100g', 'molybdenum_100g', 'iodine_100g', 'caffeine_100g', 'taurine_100g', 'ph_100g', 'fruits-vegetables-nuts_100g', 'collagen-meat-protein-ratio_100g', 'cocoa_100g', 'chlorophyl_100g', 'carbon-footprint_100g', 'nutrition-score-fr_100g', 'nutrition-score-uk_100g', 'glycemic-index_100g', 'water-hardness_100g'] --------------datetime64[ns]----------------------------------- Nombre de colonne du type datetime : 4 ----------> ['created_t', 'created_datetime', 'last_modified_t', 'last_modified_datetime']
# Répartition du pourcentage de valeurs manquantes pour toutes les variables provisoirement conservées
plt.figure(figsize=(40, 15))
df_nan = round((df.isna().sum()/df.shape[0])*100, 2)
df_nan.sort_values(inplace=True)
df_nan.plot(kind='bar', color='SteelBlue')
plt.title('Répartition du pourcentage de valeurs manquantes par variables provisoirement conservées', fontsize=35)
plt.ylabel('% Nan', fontsize=30)
plt.axhline(y=80, color='r')
# plt.axhline(y=70, color='b')
# plt.axhline(y=75, color='r')
plt.grid(False)
plt.text(2, 82, '>80% nan - A supprimer', color='r', fontsize=30)
# plt.text(2, 72, '>70% nan - ', color='b', fontsize=30)
# plt.text(22, 76, '>75% nan - A supprimer', color='r', fontsize=30)
plt.show()
# On selectionne les colonnes (lister les > 80% de données manquante) avec une fonction créée dans tools 100% onnées manquantes
liste_colonne_supp_80 = tools.get_null_factor(df,80)
# liste_colonne_supp_80
# On sauvegarde notre df (df_complet)
df_complet = df.copy()
# On crée un sous echantillon (df) : on supprime les dolonnes avec trop de données manquantes
df = df.drop(columns=liste_colonne_supp_80['Variables'],axis=1)
# On affiche le nombre de colonnes supprimées
print(f"On a supprimé {df_complet.shape[1] - df.shape[1]} colonnes")
On a supprimé 108 colonnes
tools.get_info_data(df)
------------------------------------------------------------------ Taille du jeu de données Nombre de lignes : 320772 lignes Nombre de colonnes : 54 colonnes ------------------------------------------------------------------
def search_componant(df, suffix='_100g'):
componant = []
for col in df.columns:
if '_100g' in col:
componant.append(col)
df_subset_columns = df[componant]
return df_subset_columns
df_subset_nutients = search_componant(df,'_100g')
df_subset_nutients.head()
| energy_100g | fat_100g | saturated-fat_100g | trans-fat_100g | cholesterol_100g | carbohydrates_100g | sugars_100g | fiber_100g | proteins_100g | salt_100g | sodium_100g | vitamin-a_100g | vitamin-c_100g | calcium_100g | iron_100g | nutrition-score-fr_100g | nutrition-score-uk_100g | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 2243.0 | 28.57 | 28.57 | 0.0 | 0.018 | 64.29 | 14.29 | 3.6 | 3.57 | 0.00000 | 0.000 | 0.0 | 0.0214 | 0.000 | 0.00129 | 14.0 | 14.0 |
| 2 | 1941.0 | 17.86 | 0.00 | 0.0 | 0.000 | 60.71 | 17.86 | 7.1 | 17.86 | 0.63500 | 0.250 | 0.0 | 0.0000 | 0.071 | 0.00129 | 0.0 | 0.0 |
| 3 | 2540.0 | 57.14 | 5.36 | NaN | NaN | 17.86 | 3.57 | 7.1 | 17.86 | 1.22428 | 0.482 | NaN | NaN | 0.143 | 0.00514 | 12.0 | 12.0 |
| 4 | 1552.0 | 1.43 | NaN | NaN | NaN | 77.14 | NaN | 5.7 | 8.57 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# Nombre de vide
print(f"Lignes nutriments (_100g) entièrement vides:{df_subset_nutients.isnull().all(axis=1).sum()}")
Lignes nutriments (_100g) entièrement vides:57939
# Nombre de ligne ayant au moins une colonne _100g renseignée)
df = df[df_subset_nutients.notnull().any(axis=1)]
df.shape
(262833, 54)
data.shape[0] - df.shape[0]
57939
# On crée une sauvegarde
df_complet = df.copy()
# On compare les colonnes qui enregistrent les pays
countries_columns = ['countries','countries_tags','countries_fr']
df[df[countries_columns].notnull().any(axis=1)][ countries_columns].sample(5)
| countries | countries_tags | countries_fr | |
|---|---|---|---|
| 169171 | US | en:united-states | États-Unis |
| 183977 | France | en:france | France |
| 98231 | US | en:united-states | États-Unis |
| 1860 | US | en:united-states | États-Unis |
| 159127 | US | en:united-states | États-Unis |
# On comptes le nombre de pays selon les colonnes
print(f"countries : nombre de pays : {df['countries'].nunique()}")
print(f"countries_tags : nombre de pays : {df['countries_tags'].nunique()}")
print(f"countries_fr : nombre de pays : {df['countries_fr'].nunique()}")
countries : nombre de pays : 1044 countries_tags : nombre de pays : 562 countries_fr : nombre de pays : 562
msno.bar(df.iloc[:,16:19])
<AxesSubplot: >
# Dans les cas de plusieurs pays dans une cellule on conserve le premier
df['countries_fr'] = df['countries_fr'].str.split(',', n=1, expand=True)[0]
# tableau des effectifs
def tab_effectifs(df,col):
effectifs = df[col].value_counts()
modalites = effectifs.index # contient les modalités
tab = pd.DataFrame(modalites, columns = [col]) # création du tableau à partir des modalités
tab["nb_occurences"] = effectifs.values
tab = tab.sort_values("nb_occurences",ascending=False) # tri des valeurs de la variable X (croissant)
return tab
# on visualise
tab_effectifs(df,'countries_fr')
| countries_fr | nb_occurences | |
|---|---|---|
| 0 | États-Unis | 171742 |
| 1 | France | 64553 |
| 2 | Suisse | 11087 |
| 3 | Allemagne | 4939 |
| 4 | Espagne | 2977 |
| 5 | Royaume-Uni | 1670 |
| 6 | Belgique | 1313 |
| 7 | Russie | 846 |
| 8 | Australie | 575 |
| 9 | Italie | 435 |
| 10 | Portugal | 385 |
| 11 | Canada | 247 |
| 12 | Pays-Bas | 233 |
| 13 | Danemark | 184 |
| 14 | Autriche | 146 |
| 15 | Hongrie | 125 |
| 16 | Brésil | 88 |
| 17 | Roumanie | 76 |
| 18 | Serbie | 75 |
| 19 | Nouvelle-Zélande | 61 |
| 20 | Pologne | 59 |
| 21 | Suède | 55 |
| 22 | Irlande | 48 |
| 23 | Tunisie | 46 |
| 24 | Guadeloupe | 43 |
| 25 | Norvège | 35 |
| 26 | Luxembourg | 32 |
| 27 | Hong Kong | 32 |
| 28 | Thaïlande | 29 |
| 29 | Mexique | 28 |
| 30 | Saint-Pierre-et-Miquelon | 26 |
| 31 | Guyane | 26 |
| 32 | République tchèque | 26 |
| 34 | La Réunion | 24 |
| 33 | Chine | 24 |
| 35 | Chili | 23 |
| 36 | Turquie | 21 |
| 37 | Sénégal | 18 |
| 38 | Bulgarie | 17 |
| 39 | Taiwan | 17 |
| 40 | Colombie | 17 |
| 41 | Japon | 15 |
| 42 | Afrique du Sud | 15 |
| 43 | Union européenne | 15 |
| 44 | Grèce | 14 |
| 45 | Algérie | 14 |
| 46 | Polynésie française | 13 |
| 47 | Bosnie-Herzégovine | 13 |
| 48 | Quebec | 12 |
| 49 | Argentine | 11 |
| 50 | Singapour | 11 |
| 51 | Finlande | 10 |
| 52 | en:Azərbaycan | 9 |
| 53 | Cambodge | 9 |
| 54 | Israël | 8 |
| 55 | Andorre | 8 |
| 56 | Inde | 7 |
| 57 | Monténégro | 7 |
| 58 | Cuba | 7 |
| 59 | en:Scotland | 6 |
| 60 | en:fruit-yogurts | 6 |
| 61 | Nouvelle-Calédonie | 6 |
| 62 | Pérou | 6 |
| 67 | Albanie | 5 |
| 70 | Slovénie | 5 |
| 69 | Maroc | 5 |
| 68 | Corée du Sud | 5 |
| 63 | Croatie | 5 |
| 66 | en:السعودية | 5 |
| 64 | Liban | 5 |
| 65 | Slovaquie | 5 |
| 71 | en:stirred-yogurts | 4 |
| 72 | Martinique | 4 |
| 73 | World | 4 |
| 74 | Qatar | 4 |
| 75 | Émirats arabes unis | 4 |
| 81 | Côte d'Ivoire | 3 |
| 84 | Lituanie | 3 |
| 83 | en:Deutschland | 3 |
| 82 | en:yogurts | 3 |
| 78 | Moldavie | 3 |
| 80 | Burkina Faso | 3 |
| 79 | Azerbaïdjan | 3 |
| 77 | Irak | 3 |
| 76 | en:سلطنة-عمان | 3 |
| 90 | en:Nederland | 2 |
| 93 | Biélorussie | 2 |
| 92 | Arabie saoudite | 2 |
| 91 | en:whole-milk-yogurts | 2 |
| 89 | en:Turkiye | 2 |
| 87 | Oman | 2 |
| 86 | Philippines | 2 |
| 85 | Indonésie | 2 |
| 88 | Bahreïn | 2 |
| 110 | Arménie | 1 |
| 118 | République du Congo | 1 |
| 112 | en:Island | 1 |
| 113 | Other-japon | 1 |
| 114 | en:Republique-de-chine | 1 |
| 115 | Ukraine | 1 |
| 116 | en:الإمارات-العربية-المتحدة | 1 |
| 117 | Kazakhstan | 1 |
| 120 | Togo | 1 |
| 119 | en:香港 | 1 |
| 121 | en:Denemarken | 1 |
| 122 | Malte | 1 |
| 123 | en:Belgie | 1 |
| 124 | Koweït | 1 |
| 125 | Islande | 1 |
| 111 | en:Gulf-countries | 1 |
| 97 | Costa Rica | 1 |
| 109 | Géorgie | 1 |
| 108 | Maurice | 1 |
| 107 | en:dairies | 1 |
| 106 | Panama | 1 |
| 105 | Other-turquie | 1 |
| 104 | République de Macédoine | 1 |
| 103 | en:Tunisie | 1 |
| 102 | Guyana | 1 |
| 101 | Pakistan | 1 |
| 100 | Égypte | 1 |
| 99 | Malaisie | 1 |
| 98 | en:भारत | 1 |
| 96 | Chypre | 1 |
| 95 | Aruba | 1 |
| 94 | Venezuela | 1 |
| 126 | Kenya | 1 |
# chargement des données complémentaires
df_countries = pd.read_csv("assets/datas/liste-197-etats-2020.csv",sep=";")
# visualisation du dataframe
df_countries.head()
| NOM | NOM_ALPHA | CODE | ARTICLE | NOM_LONG | CAPITALE | |
|---|---|---|---|---|---|---|
| 0 | Afghanistan | Afghanistan | AFG | l' | République islamique d'Afghanistan | Kaboul |
| 1 | Afrique du Sud | Afrique du Sud | ZAF | l' | République d'Afrique du Sud | Prétoria |
| 2 | Albanie | Albanie | ALB | l' | République d'Albanie | Tirana |
| 3 | Algérie | Algérie | DZA | l' | République algérienne démocratique et populaire | Alger |
| 4 | Allemagne | Allemagne | DEU | l' | République fédérale d'Allemagne | Berlin |
# On définit les pays valides
VALID_COUNTRIES = df_countries['NOM'].unique()
# On applique un mask pour écarter les pays qui ne sont pas valides
mask = ~df['countries_fr'].isin(VALID_COUNTRIES)
df.loc[mask, 'countries_fr'] = np.NaN
msno.bar(df.iloc[:,16:19])
<AxesSubplot: >
tab_effectifs(df,'countries_fr')
| countries_fr | nb_occurences | |
|---|---|---|
| 0 | États-Unis | 171742 |
| 1 | France | 64553 |
| 2 | Suisse | 11087 |
| 3 | Allemagne | 4939 |
| 4 | Espagne | 2977 |
| 5 | Royaume-Uni | 1670 |
| 6 | Belgique | 1313 |
| 7 | Russie | 846 |
| 8 | Australie | 575 |
| 9 | Italie | 435 |
| 10 | Portugal | 385 |
| 11 | Canada | 247 |
| 12 | Pays-Bas | 233 |
| 13 | Danemark | 184 |
| 14 | Autriche | 146 |
| 15 | Hongrie | 125 |
| 16 | Brésil | 88 |
| 17 | Roumanie | 76 |
| 18 | Serbie | 75 |
| 19 | Nouvelle-Zélande | 61 |
| 20 | Pologne | 59 |
| 21 | Suède | 55 |
| 22 | Irlande | 48 |
| 23 | Tunisie | 46 |
| 24 | Norvège | 35 |
| 25 | Luxembourg | 32 |
| 26 | Thaïlande | 29 |
| 27 | Mexique | 28 |
| 28 | Chine | 24 |
| 29 | Chili | 23 |
| 30 | Turquie | 21 |
| 31 | Sénégal | 18 |
| 33 | Colombie | 17 |
| 32 | Bulgarie | 17 |
| 34 | Afrique du Sud | 15 |
| 35 | Japon | 15 |
| 36 | Grèce | 14 |
| 37 | Algérie | 14 |
| 38 | Bosnie-Herzégovine | 13 |
| 39 | Singapour | 11 |
| 40 | Argentine | 11 |
| 41 | Finlande | 10 |
| 42 | Cambodge | 9 |
| 43 | Israël | 8 |
| 44 | Andorre | 8 |
| 45 | Inde | 7 |
| 46 | Cuba | 7 |
| 47 | Monténégro | 7 |
| 48 | Pérou | 6 |
| 52 | Croatie | 5 |
| 55 | Maroc | 5 |
| 53 | Liban | 5 |
| 54 | Corée du Sud | 5 |
| 51 | Slovaquie | 5 |
| 50 | Slovénie | 5 |
| 49 | Albanie | 5 |
| 56 | Émirats arabes unis | 4 |
| 57 | Qatar | 4 |
| 58 | Irak | 3 |
| 59 | Azerbaïdjan | 3 |
| 60 | Moldavie | 3 |
| 61 | Lituanie | 3 |
| 62 | Côte d'Ivoire | 3 |
| 66 | Bahreïn | 2 |
| 68 | Arabie saoudite | 2 |
| 67 | Philippines | 2 |
| 64 | Indonésie | 2 |
| 65 | Oman | 2 |
| 63 | Biélorussie | 2 |
| 83 | Islande | 1 |
| 82 | Togo | 1 |
| 81 | Maurice | 1 |
| 80 | Ukraine | 1 |
| 79 | Malaisie | 1 |
| 78 | Koweït | 1 |
| 77 | Chypre | 1 |
| 76 | Kazakhstan | 1 |
| 75 | Géorgie | 1 |
| 74 | Costa Rica | 1 |
| 73 | Arménie | 1 |
| 72 | Pakistan | 1 |
| 71 | Égypte | 1 |
| 70 | Guyana | 1 |
| 69 | Malte | 1 |
| 84 | Panama | 1 |
# Visualiser les 5 pays les plkus représentés
country = df.groupby(by='countries_fr')['code'].nunique().sort_values(ascending=False)
n=5
fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(aspect="equal"))
explodes = np.zeros(n)
explodes[0] = .1
plt.pie(country[:n], labels=country[:n].index,
startangle=45,
textprops=dict(color="black",size=10,
weight="bold"))
plt.title("Les {:d} pays les plus représentés".format(n), fontweight='bold',fontsize=24)
plt.show()
# On regarde les valeurs manquantes dans code
code_nan = df.loc[df['code'].isnull(),:]
# On regarde les doublons de code
code_duplicated = df.loc[df['code'].duplicated(keep=False),:]
code_duplicated.shape
(16, 54)
# On compare code_nan et code_dupuplicated
code_nan.equals(code_duplicated)
True
df_complet.shape[0] - df.shape[0]
0
# On supprime les 23 lignes en doublons qui sont aussi des données manquantes car aucun moyen de compléter ces lignes (aucun indice disponible)à
df = df[~(df.duplicated(["code"],keep=False))]
# df_row_complet.shape[0] - df.shape[0]
tools.get_info_data(df)
------------------------------------------------------------------ Taille du jeu de données Nombre de lignes : 262817 lignes Nombre de colonnes : 54 colonnes ------------------------------------------------------------------
# Nombre de créateurs, sources des données
print(f"Nombre de sources unique : {df['creator'].nunique()}")
Nombre de sources unique : 2528
creators = df.groupby(by='creator')['code'].nunique().sort_values(ascending=False)
# Graphiques des 5 plus ganrd contributeur
n= 6
fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(aspect="equal"))
explodes = np.zeros(n)
explodes[0] = .1
plt.pie(creators[:n], labels=creators[:n].index,
startangle=45,
shadow=True,
explode=explodes,
textprops=dict(color="black",size=12, weight="bold"))
plt.title("Les {:d} plus grands contributeurs".format(n),
fontweight='bold',fontsize=24)
plt.show()
tab = tab_effectifs(df,'product_name')
# On affiche les produits qui apparaissent le plus
tab.head(10)
| product_name | nb_occurences | |
|---|---|---|
| 0 | Ice Cream | 410 |
| 1 | Extra Virgin Olive Oil | 302 |
| 2 | Potato Chips | 281 |
| 3 | Premium Ice Cream | 226 |
| 4 | Tomato Ketchup | 178 |
| 5 | Beef Jerky | 167 |
| 6 | Pinto Beans | 162 |
| 7 | Cookies | 154 |
| 8 | Popcorn | 152 |
| 9 | Salsa | 149 |
# On isole les produits sdans nom
df_product_nan = df[(df['product_name'].isnull())]
tools.get_info_data(df_product_nan)
------------------------------------------------------------------ Taille du jeu de données Nombre de lignes : 3383 lignes Nombre de colonnes : 54 colonnes ------------------------------------------------------------------
# On estime le nombre de produits sans nom
df_product_nan.shape
(3383, 54)
# On suprime les produits qui n'ont pas de nom
old_product_shape=df.shape[0]
df = df[~(df['product_name'].isnull())]
print(f"On a supprimé {old_product_shape - df.shape[0]} produits sans nom")
On a supprimé 3383 produits sans nom
# On estime la taille du jeu de données
tools.get_info_data(df)
------------------------------------------------------------------ Taille du jeu de données Nombre de lignes : 259434 lignes Nombre de colonnes : 54 colonnes ------------------------------------------------------------------
# On crée une copie de notre jeu de donnée à ce stade du nettoyage
df_complet = df.copy()
# df = df_complet.copy()
category_columns = ['categories','categories_tags','categories_fr','main_category','main_category_fr','pnns_groups_1','pnns_groups_2']
df[df[category_columns].notnull().any(axis=1)][ category_columns].sample(5)
| categories | categories_tags | categories_fr | main_category | main_category_fr | pnns_groups_1 | pnns_groups_2 | |
|---|---|---|---|---|---|---|---|
| 239687 | Aliments et boissons à base de végétaux,Boisso... | en:plant-based-foods-and-beverages,en:beverage... | Aliments et boissons à base de végétaux,Boisso... | en:beverages | Boissons | Beverages | Non-sugared beverages |
| 215837 | Biscuits | en:sugary-snacks,en:biscuits-and-cakes,en:bisc... | Snacks sucrés,Biscuits et gâteaux,Biscuits | en:biscuits | Biscuits | Sugary snacks | Biscuits and cakes |
| 182583 | Produit laitier | en:dairies | Produits laitiers | en:dairies | Produits laitiers | unknown | unknown |
| 221949 | Produits laitiers,Yaourts,Yaourts aux fruits,Y... | en:dairies,en:yogurts,en:fruit-yogurts,en:drin... | Produits laitiers,Yaourts,Yaourts aux fruits,Y... | en:whole-milk-yogurts | Yaourts entiers | Milk and dairy products | Milk and yogurt |
| 212710 | Aliments et boissons à base de végétaux,Alimen... | en:plant-based-foods-and-beverages,en:plant-ba... | Aliments et boissons à base de végétaux,Alimen... | en:breads | Pains | Cereals and potatoes | Bread |
msno.bar(df[category_columns])
<AxesSubplot: >
print(f"Nombre de catégorie unique : {df['categories'].nunique()}")
print(f"Nombre de main_categorie unique : {df['main_category_fr'].nunique()}")
Nombre de catégorie unique : 28635 Nombre de main_categorie unique : 2355
# On remplace na par inconnu
df['categories_fr'].fillna('inconnu',inplace=True)
df['main_category_fr'].fillna('inconnu',inplace=True)
# On visualise : représentation graphique :
main_category = df.groupby(by='main_category_fr')['code'].nunique().sort_values(ascending=False)
# Graphiques des 5 plus ganrd contributeur
n= 8
fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(aspect="equal"))
explodes = np.zeros(n)
explodes[0] = .1
plt.pie(main_category[:n], labels=main_category[:n].index,
startangle=45,
shadow=True,
explode=explodes,
textprops=dict(color="black",size=12, weight="bold"))
plt.title("Les {:d} catégories les plus renseignées".format(n),
fontweight='bold',fontsize=24)
plt.show()
# On crée une fonction pour compter les occurences qui prend en compte plusieurs valeurs separées par des ','
def top_words(df, column="countries_fr", nb_top=10):
count_keyword = dict()
for index, col in df[column].iteritems():
if isinstance(col, float):
continue
for word in col.split(','):
if word in count_keyword.keys():
count_keyword[word] += 1
else :
count_keyword[word] = 1
keyword_top = []
for k,v in count_keyword.items():
keyword_top.append([k,v])
keyword_top.sort(key = lambda x:x[1], reverse = True)
return keyword_top[:nb_top]
df_top_categories_fr = pd.DataFrame(top_words(df, column="main_category_fr", nb_top=10),
columns=["Keyword","count"])
df_top_categories_fr
| Keyword | count | |
|---|---|---|
| 0 | inconnu | 195430 |
| 1 | Epicerie | 2418 |
| 2 | Boissons | 2410 |
| 3 | Chocolats | 2347 |
| 4 | Aliments et boissons à base de végétaux | 2336 |
| 5 | Conserves | 2158 |
| 6 | Biscuits | 1894 |
| 7 | Plats préparés | 1844 |
| 8 | Surgelés | 1741 |
| 9 | Petit-déjeuners | 1656 |
from wordcloud import WordCloud
def plot_world_cloud(df,column="main_category_fr",nb_top=100):
'''
Fonction qui permet affiche le top (defini de la colonne définie)
'''
fig = plt.figure(1, figsize=(20,15))
ax1 = fig.add_subplot(1,1,1)
words = dict()
trunc_occurences = top_words(df=df, column=column, nb_top=nb_top)
for s in trunc_occurences:
words[s[0]] = s[1]
word_cloud = WordCloud(width=900,height=500, normalize_plurals=False, background_color="white")
word_cloud.generate_from_frequencies(words)
ax1.imshow(word_cloud, interpolation="bilinear")
ax1.axis('off')
plt.title("Nuage de mots des {} meilleures {}\n".format(nb_top, column), fontsize=22,fontweight='bold')
plt.show()
# On visualise le top 100 des catégories principales
plot_world_cloud(df,column="main_category_fr",nb_top=100)
# On visualise le top 100 des categories_fr
plot_world_cloud(df,column="categories_fr",nb_top=100)
tools.get_missing_value(df,True,True)
Nombre total de données manquantes dans le dataframe : 3385213 données manquantes sur 14009436 (24.16%) ------------------------------------------------------------- Nombre et pourcentage de valeurs manquantes par variable
| Nombres de valeurs manquantes | % de valeurs manquantes | |
|---|---|---|
| image_small_url | 210582 | 81.170000 |
| image_url | 210582 | 81.170000 |
| packaging_tags | 199225 | 76.790000 |
| packaging | 199225 | 76.790000 |
| categories | 195430 | 75.330000 |
| categories_tags | 195430 | 75.330000 |
| main_category | 195430 | 75.330000 |
| pnns_groups_1 | 191372 | 73.770000 |
| pnns_groups_2 | 191170 | 73.690000 |
| quantity | 190174 | 73.300000 |
| vitamin-a_100g | 122036 | 47.040000 |
| iron_100g | 119129 | 45.920000 |
| vitamin-c_100g | 118779 | 45.780000 |
| calcium_100g | 118597 | 45.710000 |
| trans-fat_100g | 116275 | 44.820000 |
| cholesterol_100g | 115484 | 44.510000 |
| additives_tags | 109932 | 42.370000 |
| additives_fr | 109932 | 42.370000 |
| fiber_100g | 60847 | 23.450000 |
| serving_size | 51265 | 19.760000 |
| nutrition-score-uk_100g | 40971 | 15.790000 |
| nutrition-score-fr_100g | 40971 | 15.790000 |
| nutrition_grade_fr | 40971 | 15.790000 |
| saturated-fat_100g | 32793 | 12.640000 |
| additives | 25522 | 9.840000 |
| ingredients_text | 25500 | 9.830000 |
| ingredients_that_may_be_from_palm_oil_n | 25500 | 9.830000 |
| ingredients_from_palm_oil_n | 25500 | 9.830000 |
| additives_n | 25500 | 9.830000 |
| carbohydrates_100g | 19158 | 7.380000 |
| fat_100g | 18850 | 7.270000 |
| sugars_100g | 17524 | 6.750000 |
| sodium_100g | 6946 | 2.680000 |
| salt_100g | 6907 | 2.660000 |
| brands | 3371 | 1.300000 |
| brands_tags | 3372 | 1.300000 |
| proteins_100g | 2829 | 1.090000 |
| energy_100g | 1661 | 0.640000 |
| countries_fr | 341 | 0.130000 |
| countries | 65 | 0.030000 |
| countries_tags | 65 | 0.030000 |
------------------------------------------------------------------ Visualisation des données manquantes
# On visualise les doublons brands
df_marque = df[(df.duplicated(["product_name","brands"],keep="last")) | ((df['product_name'].isnull()) & (df['brands'].isnull()))]
df_marque.tail()
| code | url | creator | created_t | created_datetime | last_modified_t | last_modified_datetime | product_name | quantity | packaging | packaging_tags | brands | brands_tags | categories | categories_tags | categories_fr | countries | countries_tags | countries_fr | ingredients_text | serving_size | additives_n | additives | additives_tags | additives_fr | ingredients_from_palm_oil_n | ingredients_that_may_be_from_palm_oil_n | nutrition_grade_fr | pnns_groups_1 | pnns_groups_2 | states | states_tags | states_fr | main_category | main_category_fr | image_url | image_small_url | energy_100g | fat_100g | saturated-fat_100g | trans-fat_100g | cholesterol_100g | carbohydrates_100g | sugars_100g | fiber_100g | proteins_100g | salt_100g | sodium_100g | vitamin-a_100g | vitamin-c_100g | calcium_100g | iron_100g | nutrition-score-fr_100g | nutrition-score-uk_100g | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 320259 | 9348603001811 | http://world-fr.openfoodfacts.org/produit/9348... | usda-ndb-import | 2017-03-09 22:11:10 | 2017-03-09 22:11:10 | 2017-03-09 22:11:11 | 2017-03-09 22:11:11 | Greek Nonfat Yogurt | NaN | NaN | NaN | Welch's | welch-s | NaN | NaN | inconnu | US | en:united-states | États-Unis | Cultured pasteurized grade a nonfat milk, suga... | 150 g (1 CONTAINER) | 0.0 | [ cultured-pasteurized-grade-a-nonfat-milk ->... | NaN | NaN | 0.0 | 0.0 | a | NaN | NaN | en:to-be-completed, en:nutrition-facts-complet... | en:to-be-completed,en:nutrition-facts-complete... | A compléter,Informations nutritionnelles compl... | NaN | inconnu | NaN | NaN | 305.0 | 0.0 | 0.0 | 0.0 | 0.003 | 10.00 | 8.67 | 0.0 | 8.67 | 0.0762 | 0.03 | 0.000000 | 0.0000 | 0.100 | 0.0 | -4.0 | -4.0 |
| 320260 | 9348603001828 | http://world-fr.openfoodfacts.org/produit/9348... | usda-ndb-import | 2017-03-09 22:11:11 | 2017-03-09 22:11:11 | 2017-03-09 22:11:11 | 2017-03-09 22:11:11 | Greek Nonfat Yogurt | NaN | NaN | NaN | Welch's | welch-s | NaN | NaN | inconnu | US | en:united-states | États-Unis | Cultured pasteurized grade a nonfat milk, suga... | 150 g (1 CONTAINER) | 0.0 | [ cultured-pasteurized-grade-a-nonfat-milk ->... | NaN | NaN | 0.0 | 0.0 | a | NaN | NaN | en:to-be-completed, en:nutrition-facts-complet... | en:to-be-completed,en:nutrition-facts-complete... | A compléter,Informations nutritionnelles compl... | NaN | inconnu | NaN | NaN | 305.0 | 0.0 | 0.0 | 0.0 | 0.003 | 10.00 | 8.67 | 0.0 | 8.67 | 0.0762 | 0.03 | 0.000000 | 0.0000 | 0.100 | 0.0 | -4.0 | -4.0 |
| 320261 | 9348603001842 | http://world-fr.openfoodfacts.org/produit/9348... | usda-ndb-import | 2017-03-09 22:11:11 | 2017-03-09 22:11:11 | 2017-03-09 22:11:11 | 2017-03-09 22:11:11 | Greek Nonfat Yogurt | NaN | NaN | NaN | Welch's | welch-s | NaN | NaN | inconnu | US | en:united-states | États-Unis | Cultured pasteurized grade a nonfat milk, suga... | 150 g (1 CONTAINER) | 0.0 | [ cultured-pasteurized-grade-a-nonfat-milk ->... | NaN | NaN | 0.0 | 0.0 | a | NaN | NaN | en:to-be-completed, en:nutrition-facts-complet... | en:to-be-completed,en:nutrition-facts-complete... | A compléter,Informations nutritionnelles compl... | NaN | inconnu | NaN | NaN | 335.0 | 0.0 | 0.0 | 0.0 | 0.003 | 11.33 | 10.67 | 0.0 | 8.67 | 0.0762 | 0.03 | 0.000000 | 0.0000 | 0.100 | 0.0 | -3.0 | -3.0 |
| 320265 | 9348603002078 | http://world-fr.openfoodfacts.org/produit/9348... | usda-ndb-import | 2017-03-09 22:11:11 | 2017-03-09 22:11:11 | 2017-03-09 22:11:12 | 2017-03-09 22:11:12 | Blended Nonfat Yogurt | NaN | NaN | NaN | Welch's | welch-s | NaN | NaN | inconnu | US | en:united-states | États-Unis | Cultured pasteurized grade a nonfat milk, suga... | 170 g (1 CONTAINER) | 2.0 | [ cultured-pasteurized-grade-a-nonfat-milk ->... | en:e428,en:e418 | E428 - Gélatine,E418 - Gomme gellane | 0.0 | 0.0 | b | NaN | NaN | en:to-be-completed, en:nutrition-facts-complet... | en:to-be-completed,en:nutrition-facts-complete... | A compléter,Informations nutritionnelles compl... | NaN | inconnu | NaN | NaN | 297.0 | 0.0 | 0.0 | 0.0 | 0.000 | 13.53 | 10.59 | 0.0 | 2.94 | 0.1270 | 0.05 | 0.000088 | 0.0007 | 0.118 | 0.0 | 1.0 | 1.0 |
| 320267 | 9348603002092 | http://world-fr.openfoodfacts.org/produit/9348... | usda-ndb-import | 2017-03-09 22:11:12 | 2017-03-09 22:11:12 | 2017-03-09 22:11:12 | 2017-03-09 22:11:12 | Blended Nonfat Yogurt | NaN | NaN | NaN | Welch's | welch-s | NaN | NaN | inconnu | US | en:united-states | États-Unis | Cultured pasteurized grade a nonfat milk, suga... | 170 g (1 CONTAINER) | 2.0 | [ cultured-pasteurized-grade-a-nonfat-milk ->... | en:e428,en:e418 | E428 - Gélatine,E418 - Gomme gellane | 0.0 | 0.0 | b | NaN | NaN | en:to-be-completed, en:nutrition-facts-complet... | en:to-be-completed,en:nutrition-facts-complete... | A compléter,Informations nutritionnelles compl... | NaN | inconnu | NaN | NaN | 297.0 | 0.0 | 0.0 | 0.0 | 0.000 | 14.71 | 11.76 | 0.0 | 2.94 | 0.1270 | 0.05 | 0.000088 | 0.0007 | 0.118 | 0.0 | 1.0 | 1.0 |
# On supprime les doublons des produits de la même marque en conservant les valeurs nulles
df = df[(~df.duplicated(["product_name","brands"],keep="last")) | ((df['product_name'].isnull()) & (df['brands'].isnull()))]
print(f"On a supprimé : {data.shape[0]-df.shape[0]} lignes en doublons")
On a supprimé : 79617 lignes en doublons
print(f"Après cette première phase de nettoyage, il nous reste {round((df.shape[0]/data.shape[0])*100,2)}% des lignes et {round((df.shape[1]/data.shape[1])*100,2)}% des colonnes du jeu données.")
Après cette première phase de nettoyage, il nous reste 75.18% des lignes et 33.33% des colonnes du jeu données.
On surveille :
df.columns
Index(['code', 'url', 'creator', 'created_t', 'created_datetime',
'last_modified_t', 'last_modified_datetime', 'product_name', 'quantity',
'packaging', 'packaging_tags', 'brands', 'brands_tags', 'categories',
'categories_tags', 'categories_fr', 'countries', 'countries_tags',
'countries_fr', 'ingredients_text', 'serving_size', 'additives_n',
'additives', 'additives_tags', 'additives_fr',
'ingredients_from_palm_oil_n',
'ingredients_that_may_be_from_palm_oil_n', 'nutrition_grade_fr',
'pnns_groups_1', 'pnns_groups_2', 'states', 'states_tags', 'states_fr',
'main_category', 'main_category_fr', 'image_url', 'image_small_url',
'energy_100g', 'fat_100g', 'saturated-fat_100g', 'trans-fat_100g',
'cholesterol_100g', 'carbohydrates_100g', 'sugars_100g', 'fiber_100g',
'proteins_100g', 'salt_100g', 'sodium_100g', 'vitamin-a_100g',
'vitamin-c_100g', 'calcium_100g', 'iron_100g',
'nutrition-score-fr_100g', 'nutrition-score-uk_100g'],
dtype='object')
# On crée une copie pour travailler sur df
df_complet = df.copy()
# Réinitialiser le df
# df = df_complet.copy()
# Description des données catégorielles
tools.get_description_variables(df,'categ')
| count | unique | top | freq | first | last | |
|---|---|---|---|---|---|---|
| code | 241155 | 241155 | 0000000004530 | 1 | NaT | NaT |
| url | 241155 | 241155 | http://world-fr.openfoodfacts.org/produit/0000... | 1 | NaT | NaT |
| creator | 241155 | 2483 | usda-ndb-import | 154560 | NaT | NaT |
| created_t | 241155 | 126633 | 2017-03-09 10:37:09 | 19 | 2012-01-31 14:43:58 | 2017-04-20 21:13:06 |
| created_datetime | 241155 | 126633 | 2017-03-09 10:37:09 | 19 | 2012-01-31 14:43:58 | 2017-04-20 21:13:06 |
| last_modified_t | 241155 | 119931 | 2015-08-09 17:35:48 | 22 | 2012-04-08 08:12:35 | 2017-04-21 00:53:41 |
| last_modified_datetime | 241155 | 119931 | 2015-08-09 17:35:48 | 22 | 2012-04-08 08:12:35 | 2017-04-21 00:53:41 |
| product_name | 241155 | 187506 | Extra Virgin Olive Oil | 197 | NaT | NaT |
| quantity | 66635 | 9561 | 500 g | 3334 | NaT | NaT |
| packaging | 57879 | 11985 | Carton | 1796 | NaT | NaT |
| packaging_tags | 57879 | 9989 | sachet,plastique | 3122 | NaT | NaT |
| brands | 237992 | 46402 | Carrefour | 2368 | NaT | NaT |
| brands_tags | 237991 | 40851 | carrefour | 2423 | NaT | NaT |
| categories | 61509 | 28129 | Snacks sucrés,Biscuits et gâteaux,Biscuits | 267 | NaT | NaT |
| categories_tags | 61509 | 16302 | en:sugary-snacks,en:biscuits-and-cakes,en:bisc... | 692 | NaT | NaT |
| categories_fr | 241155 | 16303 | inconnu | 179646 | NaT | NaT |
| countries | 241092 | 962 | US | 154536 | NaT | NaT |
| countries_tags | 241092 | 512 | en:united-states | 156246 | NaT | NaT |
| countries_fr | 240823 | 80 | États-Unis | 156260 | NaT | NaT |
| ingredients_text | 216293 | 183258 | Almonds. | 194 | NaT | NaT |
| serving_size | 191355 | 24419 | 240 ml (8 fl oz) | 4918 | NaT | NaT |
| additives | 216273 | 175958 | [ extra-virgin-olive-oil -> en:extra-virgin-o... | 289 | NaT | NaT |
| additives_tags | 137930 | 38523 | en:e322 | 7393 | NaT | NaT |
| additives_fr | 137930 | 38523 | E322 - Lécithines | 7393 | NaT | NaT |
| nutrition_grade_fr | 203974 | 5 | d | 57804 | NaT | NaT |
| pnns_groups_1 | 65438 | 14 | unknown | 12074 | NaT | NaT |
| pnns_groups_2 | 65634 | 42 | unknown | 12074 | NaT | NaT |
| states | 241155 | 393 | en:to-be-completed, en:nutrition-facts-complet... | 153751 | NaT | NaT |
| states_tags | 241155 | 393 | en:to-be-completed,en:nutrition-facts-complete... | 153751 | NaT | NaT |
| states_fr | 241155 | 393 | A compléter,Informations nutritionnelles compl... | 153751 | NaT | NaT |
| main_category | 61509 | 2324 | en:beverages | 2296 | NaT | NaT |
| main_category_fr | 241155 | 2325 | inconnu | 179646 | NaT | NaT |
| image_url | 46680 | 46680 | http://fr.openfoodfacts.org/images/products/00... | 1 | NaT | NaT |
| image_small_url | 46680 | 46680 | http://fr.openfoodfacts.org/images/products/00... | 1 | NaT | NaT |
# tools.graph_NAN_with_start_end(data,False,63,162,"NUTRITION FACTS : Répartition des NaN par colonne ayant suffixe _100g")
# Sélection les colonnes de données physico-chimique sur lesquelles on travail
coll_nutrition_facts = ['fat_100g',
'saturated-fat_100g', 'trans-fat_100g', 'cholesterol_100g',
'carbohydrates_100g', 'sugars_100g', 'fiber_100g', 'proteins_100g',
'salt_100g', 'sodium_100g', 'vitamin-a_100g', 'vitamin-c_100g',
'calcium_100g', 'iron_100g']
# Description des données numérique
tools.get_description_variables(df,'num')
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| additives_n | 216293.0 | 1.988936 | 2.521034 | 0.00000 | 0.0000 | 1.00000 | 3.000000 | 3.100000e+01 |
| ingredients_from_palm_oil_n | 216293.0 | 0.019557 | 0.140197 | 0.00000 | 0.0000 | 0.00000 | 0.000000 | 2.000000e+00 |
| ingredients_that_may_be_from_palm_oil_n | 216293.0 | 0.055679 | 0.269818 | 0.00000 | 0.0000 | 0.00000 | 0.000000 | 6.000000e+00 |
| energy_100g | 239532.0 | 1143.505902 | 6722.419156 | 0.00000 | 385.0000 | 1100.00000 | 1674.000000 | 3.251373e+06 |
| fat_100g | 222717.0 | 12.671251 | 17.373254 | 0.00000 | 0.0000 | 5.08000 | 20.000000 | 7.142900e+02 |
| saturated-fat_100g | 211313.0 | 5.131128 | 8.062585 | 0.00000 | 0.0000 | 1.79000 | 7.140000 | 5.500000e+02 |
| trans-fat_100g | 130675.0 | 0.073496 | 1.542045 | -3.03000 | 0.0000 | 0.00000 | 0.000000 | 3.690000e+02 |
| cholesterol_100g | 131524.0 | 0.020254 | 0.374608 | 0.00000 | 0.0000 | 0.00000 | 0.020000 | 9.523800e+01 |
| carbohydrates_100g | 222392.0 | 32.219306 | 29.779111 | 0.00000 | 6.0000 | 21.00000 | 58.820000 | 2.916670e+03 |
| sugars_100g | 225038.0 | 15.888735 | 22.306591 | -17.86000 | 1.3000 | 5.60000 | 24.000000 | 3.520000e+03 |
| fiber_100g | 184937.0 | 2.880747 | 13.354522 | -6.70000 | 0.0000 | 1.50000 | 3.600000 | 5.380000e+03 |
| proteins_100g | 238433.0 | 7.115344 | 8.409730 | -800.00000 | 0.7100 | 4.85000 | 10.000000 | 4.300000e+02 |
| salt_100g | 234468.0 | 2.073390 | 133.874857 | 0.00000 | 0.0635 | 0.59182 | 1.384300 | 6.431280e+04 |
| sodium_100g | 234429.0 | 0.816429 | 52.711019 | 0.00000 | 0.0250 | 0.23300 | 0.545000 | 2.532000e+04 |
| vitamin-a_100g | 125829.0 | 0.000421 | 0.076616 | -0.00034 | 0.0000 | 0.00000 | 0.000107 | 2.670000e+01 |
| vitamin-c_100g | 128763.0 | 0.023660 | 2.330014 | -0.00210 | 0.0000 | 0.00000 | 0.003700 | 7.169811e+02 |
| calcium_100g | 128992.0 | 0.122060 | 2.879942 | 0.00000 | 0.0000 | 0.03500 | 0.105000 | 6.136360e+02 |
| iron_100g | 128469.0 | 0.003826 | 0.224189 | -0.00026 | 0.0000 | 0.00103 | 0.002400 | 5.000000e+01 |
| nutrition-score-fr_100g | 203974.0 | 9.146676 | 9.059450 | -15.00000 | 1.0000 | 10.00000 | 16.000000 | 4.000000e+01 |
| nutrition-score-uk_100g | 203974.0 | 9.042662 | 9.190223 | -15.00000 | 1.0000 | 9.00000 | 16.000000 | 4.000000e+01 |
On remarque déjà que les valeurs minimum et maximum de certaines variables contiennent des valeurs aberrantes, ce qui impacte l'écart-type et la variance.
On complète avec les fiches des produits à l'adresse suivante :
- exemple : https://world-fr.openfoodfacts.org/produit/3257983143096/pois-casses-cora
df_error_max = df.loc[df['energy_100g']>13000]
df_error_max[['code','energy_100g']].sort_values(by="energy_100g",ascending=False)
| code | energy_100g | |
|---|---|---|
| 212928 | 3257983143096 | 3251373.0 |
| 313506 | 8710573641501 | 231199.0 |
| 249101 | 3661405001053 | 182764.0 |
| 112681 | 0201203040026 | 110579.0 |
| 119184 | 0619309100979 | 94140.0 |
| 176697 | 2000000045489 | 22000.0 |
| 245311 | 3596710288755 | 18700.0 |
| 223318 | 3291960006127 | 15481.0 |
| 46967 | 0041390030512 | 14644.0 |
| 115777 | 0444444387721 | 14347.0 |
| 35308 | 0038233241334 | 13213.0 |
# On remplace les valeurs abberantes fausse d'après leur fiche produit
dict = {
"3257983143096" : 1373,
"8710573641501" : 2312,
"3661405001053" : 182,
"0201203040026" : 3700,
"0619309100979" : 92,
"2000000045489" : 2200,
"3596710288755" : 2807,
"3291960006127" : 3766,
"0041390030512" : 1464,
"0444444387721" : 1393,
}
for cle, valeur in dict.items():
print("l'élément de clé", cle, "vaut", valeur)
l'élément de clé 3257983143096 vaut 1373 l'élément de clé 8710573641501 vaut 2312 l'élément de clé 3661405001053 vaut 182 l'élément de clé 0201203040026 vaut 3700 l'élément de clé 0619309100979 vaut 92 l'élément de clé 2000000045489 vaut 2200 l'élément de clé 3596710288755 vaut 2807 l'élément de clé 3291960006127 vaut 3766 l'élément de clé 0041390030512 vaut 1464 l'élément de clé 0444444387721 vaut 1393
# On remplace par les bonnes valeurs dans le df
for cle, valeur in dict.items():
df.loc[df['code']==cle,'energy_100g'] = valeur
# Df qui contient tous les produits dont les valeurs sont supérieures à 100
df_sup_100 = df[(df[coll_nutrition_facts] > 100).any(axis=1)]
print(f"Il a à {df_sup_100.shape[0]} éléments supérieur à 100 dans le df")
Il a à 182 éléments supérieur à 100 dans le df
# On supprime les lignes
df = df[~(df[coll_nutrition_facts] > 100).any(axis=1)]
df.shape
(240973, 54)
# Df qui contient tous les produits dont les valeurs sont supérieures à 100
df_inf_0 = df[(df[coll_nutrition_facts] < 0).any(axis=1)]
print(f"Il a à {df_inf_0.shape[0]} éléments inférieur à 0 dans le df")
Il a à 17 éléments inférieur à 0 dans le df
# On supprime les lignes
df = df[~(df[coll_nutrition_facts] < 0).any(axis=1)]
df.shape
(240956, 54)
df_complet = df.copy()
df= df[~((df['saturated-fat_100g'] > df['fat_100g'])
| (df['sodium_100g'] > df['salt_100g']))]
print(f"On a supprimer {df_complet.shape[0] - df.shape[0]} lignes")
On a supprimer 327 lignes
# crée une copy
df_complet = df.copy()
# On test les > 3800 j
# df_max_kj = df[(df['energy_100g'] > 3800)]
# df_max_kj.shape
# df_max_kj.sort_values(by='energy_100g',ascending=False).head()
# On supprime les lignes
df = df[~(df['energy_100g'] > 3800)]
print(f"On a supprimé {df_complet.shape[0] - df.shape[0]} lignes")
On a supprimé 325 lignes
col_selection = [
'code', 'creator', 'created_datetime', 'last_modified_datetime', 'product_name',
'brands','categories_fr','countries_fr','additives_n','additives_fr',
'ingredients_from_palm_oil_n', 'nutrition_grade_fr',
'main_category_fr','energy_100g', 'fat_100g', 'saturated-fat_100g',
'carbohydrates_100g', 'sugars_100g', 'fiber_100g',
'proteins_100g', 'salt_100g', 'sodium_100g', 'nutrition-score-fr_100g'
]
df_app = df[col_selection]
# On crée une copie pour travailler sur df
df_app_complet = df_app.copy()
# Réinitialiser le df
# df_app = df_app_complet.copy()
sns.set(context="paper", font_scale = 1.2)
# compute the correlation matrix for all the numeric columns
corrmat = df_app.corr()
# size of the plot
f, ax = plt.subplots(figsize=(12, 12))
# set the plot heading
f.text(0.45, 0.93, "Coefficients de corrélation", ha='center', fontsize = 18)
# plot matrix as a heatmap
sns.heatmap(corrmat, square=True, linewidths=0.01, cmap="coolwarm")
plt.tight_layout()
tools.get_missing_value(df_app,True,True)
Nombre total de données manquantes dans le dataframe : 385583 données manquantes sur 5526992 (6.98%) ------------------------------------------------------------- Nombre et pourcentage de valeurs manquantes par variable
| Nombres de valeurs manquantes | % de valeurs manquantes | |
|---|---|---|
| additives_fr | 102639 | 42.710000 |
| fiber_100g | 55918 | 23.270000 |
| nutrition_grade_fr | 36899 | 15.360000 |
| nutrition-score-fr_100g | 36899 | 15.360000 |
| saturated-fat_100g | 29725 | 12.370000 |
| additives_n | 24730 | 10.290000 |
| ingredients_from_palm_oil_n | 24730 | 10.290000 |
| carbohydrates_100g | 18670 | 7.770000 |
| fat_100g | 18352 | 7.640000 |
| sugars_100g | 15882 | 6.610000 |
| sodium_100g | 6700 | 2.790000 |
| salt_100g | 6661 | 2.770000 |
| brands | 3145 | 1.310000 |
| proteins_100g | 2689 | 1.120000 |
| energy_100g | 1614 | 0.670000 |
| countries_fr | 330 | 0.140000 |
------------------------------------------------------------------ Visualisation des données manquantes
Additive_n : Soit le nombre est inconnu soit il n'est pas renseigné, On décide de remplacer les NA par 0
df_app['additives_n'] = df_app['additives_n'].fillna(0)
Nombre d'ingrédients provenant d'huile de palme : (0,nan,1 ou 2 sont les valeurs possibles) : Ne pouvant les connaitre, on choisit que lesw na sont soit des produit pour les quelles on a pas renseigné la valeur soit qu'il ne contiennent pas d'ingrédients contenant de l'huile de Palme ==> fillna(0)
df_app['ingredients_from_palm_oil_n'] = df_app['ingredients_from_palm_oil_n'].fillna(0)
Le nom des Pays : On a tenté de lier certain nom de marque à des pays comme Danone mais le pays n'est pas liée à la marque : on remplace les NA par la valeur "monde" référent à une multinationale.
df_app['countries_fr'] = df_app['countries_fr'].fillna("monde")
Brands : on decide de compléter avec 'inconnue'
df_app['brands'] = df_app['brands'].fillna("inconnue")
additives_fr : L'(information n'étant pas calculable, on remplace par une chaine caractère vide : ' '
df_app['additives_fr'] = df_app['additives_fr'].fillna(" ")
nutrition_grade_fr !!!!!!!!!!!!!!!
df_app['nutrition_grade_fr'] = df_app['nutrition_grade_fr'].fillna(0)
# df_app = df_app.fillna()
tools.get_missing_value(df_app,True,False)
Nombre total de données manquantes dans le dataframe : 193110 données manquantes sur 5526992 (3.49%) ------------------------------------------------------------- Nombre et pourcentage de valeurs manquantes par variable
| Nombres de valeurs manquantes | % de valeurs manquantes | |
|---|---|---|
| fiber_100g | 55918 | 23.270000 |
| nutrition-score-fr_100g | 36899 | 15.360000 |
| saturated-fat_100g | 29725 | 12.370000 |
| carbohydrates_100g | 18670 | 7.770000 |
| fat_100g | 18352 | 7.640000 |
| sugars_100g | 15882 | 6.610000 |
| sodium_100g | 6700 | 2.790000 |
| salt_100g | 6661 | 2.770000 |
| proteins_100g | 2689 | 1.120000 |
| energy_100g | 1614 | 0.670000 |
------------------------------------------------------------------
# export
df_app.to_csv('assets/datas/df_application.csv', sep='\t',index=False)
KNNImputer
from sklearn.impute import KNNImputer
df_app_imputer = df_app.copy()
imputer = KNNImputer( missing_values=np.nan,n_neighbors=2)
energy_100g', 'fat_100g','saturated-fat_100g
float_columns = ['energy_100g', 'fat_100g','saturated-fat_100g']
df_app_imputer[float_columns] = imputer.fit_transform(df_app_imputer[float_columns])
'carbohydrates_100g', 'sugars_100g'
float_columns =['carbohydrates_100g', 'sugars_100g']
df_app_imputer[float_columns] = imputer.fit_transform(df_app_imputer[float_columns])
'salt_100g', 'sodium_100g'
float_columns =['salt_100g', 'sodium_100g']
df_app_imputer[float_columns] = imputer.fit_transform(df_app_imputer[float_columns])
tools.get_missing_value(df_app_imputer,True,False)
Nombre total de données manquantes dans le dataframe : 95506 données manquantes sur 5526992 (1.73%) ------------------------------------------------------------- Nombre et pourcentage de valeurs manquantes par variable
| Nombres de valeurs manquantes | % de valeurs manquantes | |
|---|---|---|
| fiber_100g | 55918 | 23.270000 |
| nutrition-score-fr_100g | 36899 | 15.360000 |
| proteins_100g | 2689 | 1.120000 |
------------------------------------------------------------------
# export
df_app_imputer.to_csv('assets/datas/df_application_imputer2.csv', sep='\t',index=False)
dfimp = df_app.copy()
float_columns = ['energy_100g', 'fat_100g','saturated-fat_100g', 'carbohydrates_100g', 'sugars_100g', 'fiber_100g',
'proteins_100g', 'salt_100g', 'sodium_100g', 'nutrition-score-fr_100g']
dfimp[float_columns] = imputer.fit_transform(dfimp[float_columns])
tools.get_missing_value(dfimp,True,False)
Nombre total de données manquantes dans le dataframe : 0 données manquantes sur 5526992 (0.0%) ------------------------------------------------------------- Nombre et pourcentage de valeurs manquantes par variable
| Nombres de valeurs manquantes | % de valeurs manquantes |
|---|
------------------------------------------------------------------
# export
dfimp.to_csv('assets/datas/df_application_imputerAll.csv', sep='\t',index=False)